import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=2)
sns.set_style("whitegrid")
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
We will be looking at the FIFA 2018 Dataset. While this is a video game, the developers strive to make their game as accurate as possible, so this data reflects the skills of the real-life players.
Let's load the data frame using pandas.
# Load the FIFA 2018 table from CSV; the first CSV column becomes the index.
df = pd.read_csv(
    "FIFA_2018.csv",
    encoding="ISO-8859-1",
    index_col=0,
    low_memory=False,
)

# Disabled step, kept for reference: exclude rows whose "Position" is "GK"
# and drop the goalkeeper-specific columns.
# df2 = df[df["Position"] != "GK"].copy()
# df2.drop(['GK diving',
# 'GK handling',
# 'GK kicking',
# 'GK positioning',
# 'GK reflexes'],1,inplace=True)
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): X and Y are not defined in the visible part of this file --
# confirm they are built from `df` before this statement runs.
seed = 7
validation_size = 0.3
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)
# Candidate classifiers, keyed by the display name printed in the results
# table. Estimators use their default settings unless arguments are given.
dict_classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(),
    "LDA": LinearDiscriminantAnalysis(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    # DecisionTreeClassifier fits a single tree, not an ensemble, so it is
    # labelled as a decision tree rather than a random forest.
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
    # LogisticRegression is a classifier; label it by its actual name.
    "Logistic Regression": LogisticRegression(solver='liblinear', multi_class='ovr'),
}
# Fit every candidate on the training split and print its accuracy on the
# held-out split as one row of a two-column table.
print('%30s %16s' % ("Classifier","accuracy") )
for name, clf in dict_classifiers.items():
    clf.fit(X_train, Y_train)
    y_result = clf.predict(X_test)
    acc = accuracy_score(Y_test, y_result)
    print('%30s %16f' % (name, acc) )

# Confusion matrix with rows/columns ordered DEF, MID, FWD.
# NOTE(review): the original indentation was lost, so it is unclear whether
# this belonged inside the loop. Placed here it uses `y_result` from the last
# classifier iterated above; indent these two lines into the loop if a matrix
# per classifier is wanted.
cmat = confusion_matrix(Y_test, y_result, labels=["DEF", "MID", "FWD"])
print(cmat)